Primary Vignette - Classification Strategies
Objectives
Perform exploratory data analysis
Reduce dimensionality using principal component analysis
Employ logistic regression using
glm()andmultinom()Building a random forest model using cross-validation
We’ll illustrate these strategies using the California Household Travel Survey (CHTS) dataset.
# Loading necessary packages
library(tidyverse)
library(ggplot2)
library(dplyr)
library(tidymodels)
library(sparsesvd)
library(nnet)
library(sf)
library(mapview)
mapviewOptions(fgb = FALSE)
library(leafsync)
library(maps)
library(nnet)
library(randomForest)
library(vip)
library(SparseM)
library(Matrix)
# Read in datasets
PersonData <- read_rds('./Data/raw/PersonData_111A.Rds')
HHData <- read_rds('./Data/raw/HHData_111A.Rds')
hh_bgDensity <- read_rds('./Data/raw/hh_bgDensity.Rds')
county_shp <- st_read("./Data/raw/counties/counties.shp")Reading layer `counties' from data source
`C:\Users\immai\Downloads\pstat197\vignette-group3\Data\raw\counties\counties.shp'
using driver `ESRI Shapefile'
Simple feature collection with 58 features and 17 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -124.3926 ymin: 32.53578 xmax: -114.1312 ymax: 42.00952
Geodetic CRS: WGS 84
# Merge datasets
personHHData <- left_join(PersonData, HHData) %>%
left_join(hh_bgDensity)
data <- personHHData %>%
filter(WorkDaysWk != 8 , WorkDaysWk !=9,
TypicalHoursWk != 998, TypicalHoursWk!= 999,
TransitTripsWk != 98, TransitTripsWk != 99,
WalkTripsWk != 98, WalkTripsWk != 99,
BikeTripsWk != 98, BikeTripsWk != 99) %>%
mutate(HH_anyTransitRider = as.factor(HH_anyTransitRider),
HH_homeType = as.factor(HH_homeType),
HH_homeowner = as.factor(HH_homeowner),
HH_isHispanic = as.factor(HH_isHispanic),
HH_intEnglish = as.factor(HH_intEnglish),
HH_income = as.factor(HH_income),
Male = as.factor(Male),
persWhite = as.factor(persWhite),
persHisp = as.factor(persHisp),
persAfricanAm = as.factor(persAfricanAm),
persNativeAm = as.factor(persNativeAm),
persAsian = as.factor(persAsian),
persPacIsl = as.factor(persPacIsl),
persOthr = as.factor(persOthr),
persDKrace = as.factor(persDKrace),
persRFrace = as.factor(persRFrace),
bornUSA = as.factor(bornUSA),
DriverLic = as.factor(DriverLic),
TransitPass = as.factor(TransitPass),
Employed = as.factor(Employed),
WorkFixedLoc = as.factor(WorkFixedLoc),
WorkHome =as.factor(WorkHome),
WorkNonfixed = as.factor(WorkNonfixed),
FlexSched = as.factor(FlexSched),
FlexPrograms = as.factor(FlexPrograms),
Disability = as.factor(Disability),
DisLicensePlt = as.factor(DisLicensePlt),
Student = as.factor(Student),
workday = as.factor(workday),
hhid = paste(hhid, pnum),
SchoolMode = as.factor(SchoolMode),
WorkMode = as.factor(WorkMode),
EducationCompl = as.factor(EducationCompl)) %>%
select(-pnum)
# Preprocess data
numeric_columns <- sapply(data, is.numeric)
numeric_data <- data[, numeric_columns]
numeric_data <- numeric_data %>% select(-CTFIP, -bg_density)
scaled_data <- scale(numeric_data)
hhid <- data$hhid
bg_group <- as.data.frame(data$bg_group)
scaled_data <- cbind(hhid, bg_group, scaled_data) %>%
mutate(bg_group = data$bg_group)
scaled_data_clean <- na.omit(scaled_data) %>%
as.data.frame() %>%
select(-`data$bg_group`)Exploratory Data Analysis
Before we partition our data and fit classification models, we want to get to know our data through some exploratory visualizations. Since our data is geographically structured, we found it helpful to visualize our data on a map.
Static Map
county_bg_aggreg <- data %>%
group_by(County, CTFIP, bg_group) %>% # group by county, CTFIP, and also bg_group
mutate(count = n()) %>%
summarise_at(vars(-hhid), mean)
county_bg_shp <- county_shp %>%
merge(data.frame(bg_group = c("Urban", "Suburban", "Exurban", "Rural"))) %>%
left_join(county_bg_aggreg)
# get the CA county data
county <- ggplot2::map_data("county", region = "california")
county_bg <- merge(county, data.frame(bg_group = c("Urban", "Suburban", "Exurban", "Rural")))
county_bg_all <- county_bg_aggreg %>%
mutate(subregion = tolower(County)) %>%
full_join(county_bg, by = c("subregion", "bg_group"))
ggplot(county_bg_all) +
geom_polygon(aes(x = long, y = lat, group = subregion, fill = Sum_PMT), colour = "white") +
scale_fill_distiller(palette = "YlGnBu", direction = 1) +
facet_wrap(vars(bg_group), nrow = 2) + # multi-panel plots using facet_wrap(), plot in 2 rows
ggtitle("Total PMT in California at County-level") +
theme_void() +
theme(legend.position="bottom")
These maps show the total distance traveled by country. The grey counties represent missing data.
Sum Trips by Residential Area
urban_TripMap <- mapview(filter(county_bg_shp, bg_group == "Urban"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Urban Trips")
suburb_TripMap <- mapview(filter(county_bg_shp, bg_group == "Suburban"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Suburban Trips")
exurb_TripMap <- mapview(filter(county_bg_shp, bg_group == "Exurban"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Exurban Trips")
rural_TripMap <- mapview(filter(county_bg_shp, bg_group == "Rural"),
zcol = "Sum_Trips", legend = TRUE, popup = NULL,
layer.name = "Rural Trips")
latticeview(urban_TripMap, suburb_TripMap, exurb_TripMap, rural_TripMap, sync = "all")